Make sure following packages are installed before compiling the RNotebook file.
if (!requireNamespace("BiocManager", quietly = TRUE)) {
install.packages("BiocManager")
}
library(BiocManager)
if (!requireNamespace("GEOmetadb", quietly = TRUE)) {
BiocManager::install("GEOmetadb")
}
library(GEOmetadb)
if (!requireNamespace("biomaRt", quietly = TRUE)) {
BiocManager::install("biomaRt")
}
library(biomaRt)
if (!requireNamespace("edgeR", quietly = TRUE)) {
install.packages("edgeR")
}
library(edgeR)
if (!requireNamespace("DBI", quietly = TRUE)) {
install.packages("DBI")
}
library(DBI)
if (!requireNamespace("limma", quietly = TRUE)) {
install.packages("limma")
}
library(limma)
if(!requireNamespace("knitr", quietly=TRUE)) {
install.packages("knitr")
}
library(knitr)
if(!requireNamespace("kableExtra", quietly=TRUE)) {
install.packages("kableExtra")
}
library(kableExtra)
if (!requireNamespace("ComplexHeatmap", quietly = TRUE)) {
install.packages("ComplexHeatmap")
}
library(ComplexHeatmap)
if (!requireNamespace("circlize", quietly = TRUE)) {
install.packages("circlize")
}
library(circlize)
if (!requireNamespace("gprofiler2", quietly = TRUE)) {
install.packages("gprofiler2")
}
library(gprofiler2)
Data set GSE89225 was selected, cleaned and normalized in assignment 1. The data set contains RNAseq data from normal breast parenchyma (NBP) and tumor tissues (Plitas et al. 2016). Following is the basic procedure of data manipulation we did.
# Get expression data
geo_id <- "GSE89225"
# Get expression data from GEOquery
sfiles <- GEOquery::getGEOSuppFiles(geo_id)
fnames <- rownames(sfiles)
# Read CSV and change column names
tcell_exp <- read.csv(fnames[1], header = TRUE, check.names = FALSE)
colnames(tcell_exp)[1] <- "ensembl_gene_id"
# Separate by "_" and define groups
samples <- data.frame(lapply(colnames(tcell_exp)[2:35],
FUN=function(x){unlist(strsplit(x, split = "_"))}))
colnames(samples) <- colnames(tcell_exp)[2:35]
rownames(samples) <- c("Tcell_type","tissue_type", "patients")
samples <- data.frame(t(samples))
# Output group table
samples[1:10, ] %>%
kableExtra::kbl(caption = "Table 1. Groups of Samples") %>%
kableExtra::kable_classic(full_width = F, html_font = "Cambria")
| Tcell_type | tissue_type | patients | |
|---|---|---|---|
| tconv_tumor_patient10 | tconv | tumor | patient10 |
| cd177negtreg_tumor_patient10 | cd177negtreg | tumor | patient10 |
| cd177postreg_tumor_patient10 | cd177postreg | tumor | patient10 |
| tconv_tumor_patient1 | tconv | tumor | patient1 |
| treg_tumor_patient1 | treg | tumor | patient1 |
| tconv_NBP_patient2 | tconv | NBP | patient2 |
| treg_NBP_patient2 | treg | NBP | patient2 |
| tconv_tumor_patient2 | tconv | tumor | patient2 |
| cd177negtreg_tumor_patient2 | cd177negtreg | tumor | patient2 |
| cd177postreg_tumor_patient2 | cd177postreg | tumor | patient2 |
# Check for duplicated genes
summarized_gene_counts <- sort(table(tcell_exp$ensembl_gene_id), decreasing = TRUE)
# Remove invalid Ensenbl gene ids
valid_tcell_exp <- tcell_exp[grep("ENSG", tcell_exp$ensembl_gene_id), ]
# Translate out counts into counts per million
cpms <- edgeR::cpm(valid_tcell_exp[, 2:35])
rownames(cpms) <- valid_tcell_exp[, 1]
# Get rid of low counts
keep <- rowSums(cpms > 1) >= 10
filtered_tcell_exp <- valid_tcell_exp[keep, ]
# Connect to the desired mart
ensembl <- biomaRt::useMart("ensembl")
# Get the set of datasets availble
datasets <- biomaRt::listDatasets(ensembl)
# Limit to the human datasets availble
ensembl <- biomaRt::useDataset("hsapiens_gene_ensembl", mart=ensembl)
# Naming id conversion stash
conversion_stash <- "tcell_id_conversion.rds"
# Save the conversion
if (file.exists(conversion_stash)) {
tcell_id_conversion <- readRDS(conversion_stash)
} else {
tcell_id_conversion <- biomaRt::getBM(attributes = c("ensembl_gene_id","hgnc_symbol"),
filters = c("ensembl_gene_id"),
values = filtered_tcell_exp$ensembl_gene_id,
mart = ensembl)
saveRDS(tcell_id_conversion, conversion_stash)
}
# Annotated version (merge)
annot_tcell_exp <- merge(tcell_id_conversion, filtered_tcell_exp, all.y = TRUE)
# Reassignment
original_tcell_exp <- annot_tcell_exp
# Output original data set table
original_tcell_exp[1:5,1:5] %>%
kableExtra::kbl(caption = "Table 2. Original Data Set") %>%
kableExtra::kable_classic(full_width = F, html_font = "Cambria")
| ensembl_gene_id | hgnc_symbol | tconv_tumor_patient10 | cd177negtreg_tumor_patient10 | cd177postreg_tumor_patient10 |
|---|---|---|---|---|
| ENSG00000000003 | TSPAN6 | 510 | 935 | 1286 |
| ENSG00000000419 | DPM1 | 613 | 818 | 572 |
| ENSG00000000457 | SCYL3 | 269 | 587 | 364 |
| ENSG00000000460 | C1orf112 | 55 | 142 | 288 |
| ENSG00000000938 | FGR | 36 | 69 | 111 |
# Create an edgeR container for RNASeq count data
original_data_matrix <- as.matrix(original_tcell_exp[, 3:36])
rownames(original_data_matrix) <- original_tcell_exp$ensembl_gene_id
# Calculate the normalization factors
d <- edgeR::DGEList(counts = original_data_matrix, group = samples$tissue_type)
d <- edgeR::calcNormFactors(d)
# Normalize data set
normalized_tcell_exp <- edgeR::cpm(d)
normalized_tcell_exp <- cbind(original_tcell_exp[, 1:2], normalized_tcell_exp)
rownames(normalized_tcell_exp) <- NULL
# Remove duplicate HGNC symbols to avoid false assignment in the future
normalized_tcell_exp <- dplyr::distinct(normalized_tcell_exp, hgnc_symbol, .keep_all = TRUE)
# Output normalized data set table
normalized_tcell_exp[1:5,1:5] %>%
kableExtra::kbl(caption = "Table 3. Normalized Data Set") %>%
kableExtra::kable_classic(full_width = F, html_font = "Cambria")
| ensembl_gene_id | hgnc_symbol | tconv_tumor_patient10 | cd177negtreg_tumor_patient10 | cd177postreg_tumor_patient10 |
|---|---|---|---|---|
| ENSG00000000003 | TSPAN6 | 45.054130 | 73.682958 | 102.619073 |
| ENSG00000000419 | DPM1 | 54.153298 | 64.462738 | 45.643942 |
| ENSG00000000457 | SCYL3 | 23.763845 | 46.258713 | 29.046145 |
| ENSG00000000460 | C1orf112 | 4.858779 | 11.190353 | 22.981565 |
| ENSG00000000938 | FGR | 3.180292 | 5.437566 | 8.857478 |
# Create a numerical heatmap matrix
heatmap_matrix <- normalized_tcell_exp[, 3:ncol(normalized_tcell_exp)]
rownames(heatmap_matrix) <- normalized_tcell_exp$ensembl_gene_id
colnames(heatmap_matrix) <- colnames(normalized_tcell_exp[, 3:ncol(normalized_tcell_exp)])
# Scale
heatmap_matrix <- t(scale(t(heatmap_matrix)))
# Decide the colour by range
heatmap_col = circlize::colorRamp2(c(min(heatmap_matrix), 0,
max(heatmap_matrix)), c("blue", "white", "red"))
# Create the heatmap
current_heatmap <- ComplexHeatmap::Heatmap(as.matrix(heatmap_matrix),
show_row_dend = TRUE,
show_column_dend = TRUE,
col=heatmap_col,
show_column_names = TRUE,
show_row_names = FALSE,
show_heatmap_legend = TRUE)
# Heatmap is too big for re-creating it every single time, so it is pre-created on local machine and saved in the folder
Figure 4. Initial Heatmap
The initial heatmap does not exhibit clear gene expression variations. Therefore, differential gene expression analysis is necessary.
# Plot MDS by tissue type
limma::plotMDS(normalized_tcell_exp[, 3:36],
labels = samples$tissue_type,
col = c("darkgreen", "red")[factor(samples$tissue_type)],
main = "Figure 5. MDS Plot by Tissue Type")
# Plot MDS by patient
limma::plotMDS(normalized_tcell_exp[, 3:36],
labels = samples$patient,
col = unlist(rainbow(10))[factor(samples$patient)],
main = "Figure 6. MDS Plot by Patient")
Multidimensional scaling (MDS) plots help us to examine the variation between tissue type and patients. The clustering of samples from the same patient is tighter than the clustering of samples from different patients. Consequently, it is important to consider the potential impact of patient variability on the presence of false-positive results.
From here, we will use the limma package to examine the
different between simple (tissue-only) model and patient
(tissue+patient) model.
# Create design matrix (tissue type only)
model_design <- model.matrix(~ samples$tissue_type)
# Create data matrix
expressionMatrix <- as.matrix(normalized_tcell_exp[, 3:36])
rownames(expressionMatrix) <- normalized_tcell_exp$ensembl_gene_id
colnames(expressionMatrix) <- colnames(normalized_tcell_exp)[3:36]
minimalSet <- Biobase::ExpressionSet(assayData = expressionMatrix)
# Fit data to model
fit <- limma::lmFit(minimalSet, model_design)
# The parameter trend = T is specfic to RNA-seq data
fit2 <- limma::eBayes(fit, trend = TRUE)
# Create topfit table
topfit <- limma::topTable(fit2,
coef = ncol(model_design),
adjust.method = "BH",
number = nrow(expressionMatrix))
# Merge hgnc names to topfit table
output_hits <- merge(normalized_tcell_exp[, 1:2],
topfit,
by.y = 0, by.x = 1,
all.y = TRUE)
output_hits_copy <- output_hits
# Sort by p-value
output_hits <- output_hits[order(output_hits$P.Value),]
# Output data fitted with tissue Type by limma (ordered by p-value)
output_hits[1:10, 2:8] %>%
kableExtra::kbl(caption = "Table 7. Fitted with Tissue Type by Limma (ordered by p-value)",
row.names = FALSE) %>%
kableExtra::kable_classic(full_width = F, html_font = "Cambria")
| hgnc_symbol | logFC | AveExpr | t | P.Value | adj.P.Val | B |
|---|---|---|---|---|---|---|
| SSBP2 | -65.345841 | 83.089167 | -5.548754 | 3.30e-06 | 0.0516639 | -0.1418015 |
| PLPBP | 39.327009 | 74.693203 | 5.217616 | 8.90e-06 | 0.0697884 | -0.4942265 |
| PER2 | -55.985955 | 66.537828 | -4.916120 | 2.18e-05 | 0.0917429 | -0.8240738 |
| ABCG1 | 18.304010 | 22.461308 | 4.894465 | 2.33e-05 | 0.0917429 | -0.8480566 |
| PLD1 | -10.039382 | 9.457198 | -4.742828 | 3.66e-05 | 0.1152740 | -1.0169774 |
| ZC3H3 | 4.339708 | 4.265111 | 4.584040 | 5.86e-05 | 0.1184221 | -1.1955240 |
| TRIM69 | 67.931973 | 79.345773 | 4.571694 | 6.08e-05 | 0.1184221 | -1.2094701 |
| TLE1 | -9.220237 | 6.179573 | -4.543103 | 6.61e-05 | 0.1184221 | -1.2417974 |
| ARHGAP5 | -39.317566 | 55.583784 | -4.467493 | 8.27e-05 | 0.1184221 | -1.3274954 |
| METRNL | 2.624850 | 2.740002 | 4.465095 | 8.32e-05 | 0.1184221 | -1.3302181 |
# How many genes pass the threshold p-value < 0.05?
length(which(output_hits$P.Value < 0.05))
## [1] 2043
# How many genes pass correction?
length(which(output_hits$adj.P.Val < 0.05))
## [1] 0
# Create design matrix (tissue type + patient)
model_design_pat <- model.matrix(~ samples$patients + samples$tissue_type)
# Fit data to model
fit_pat <- limma::lmFit(minimalSet, model_design_pat)
# The parameter trend = T is specfic to RNA-seq data
fit2_pat <- limma::eBayes(fit_pat, trend = TRUE)
# Create topfit table
topfit_pat <- limma::topTable(fit2_pat,
coef = ncol(model_design_pat),
adjust.method = "BH",
number = nrow(expressionMatrix))
# Merge hgnc names to topfit table
output_hits_pat <- merge(normalized_tcell_exp[, 1:2],
topfit_pat,
by.y = 0, by.x = 1,
all.y = TRUE)
# Sort by p-value
output_hits_pat <- output_hits_pat[order(output_hits_pat$P.Value),]
# Output data fitted with tissue type by limma (ordered by p-value)
output_hits_pat[1:10, 2:8] %>%
kableExtra::kbl(caption = "Table 8. Data Fitted with Tissue Type and Patient by Limma (ordered by p-value)",
row.names = FALSE) %>%
kableExtra::kable_classic(full_width = F, html_font = "Cambria")
| hgnc_symbol | logFC | AveExpr | t | P.Value | adj.P.Val | B |
|---|---|---|---|---|---|---|
| DNAJA1 | 227.439331 | 431.0982639 | 6.369439 | 0.0000011 | 0.0174358 | -0.8462367 |
| ZC3H3 | 4.566748 | 4.2651106 | 4.934135 | 0.0000434 | 0.2642690 | -1.7721030 |
| CA6 | -3.630443 | 2.4799616 | -4.791312 | 0.0000629 | 0.2642690 | -1.8762180 |
| TRIM69 | 58.955393 | 79.3457734 | 4.721288 | 0.0000755 | 0.2642690 | -1.9279988 |
| PER2 | -62.500636 | 66.5378281 | -4.501358 | 0.0001337 | 0.2642690 | -2.0936415 |
| POLD1 | 2.608097 | 1.7590874 | 4.466829 | 0.0001463 | 0.2642690 | -2.1200451 |
| ZNF114-AS1 | 1.177378 | 0.9962899 | 4.406475 | 0.0001711 | 0.2642690 | -2.1664448 |
| GSTM2 | -13.603408 | 12.1641462 | -4.361438 | 0.0001923 | 0.2642690 | -2.2012701 |
| CMTR1 | 21.357471 | 30.3903206 | 4.302490 | 0.0002241 | 0.2642690 | -2.2471034 |
| OASL | 134.813692 | 173.8437189 | 4.284223 | 0.0002350 | 0.2642690 | -2.2613627 |
# How many genes pass the threshold p-value < 0.05?
length(which(output_hits_pat$P.Value < 0.05))
## [1] 1712
# How many genes pass correction?
length(which(output_hits_pat$adj.P.Val < 0.05))
## [1] 1
# Compare the results from the two different models
simple_model_pvalues <- data.frame(hgnc_symbol = output_hits$hgnc_symbol,
simple_pvalue = output_hits$P.Value)
pat_model_pvalues <- data.frame(hgnc_symbol = output_hits_pat$hgnc_symbol,
patient_pvalue = output_hits_pat$P.Value)
# Merge two model
two_models_pvalues <- merge(simple_model_pvalues, pat_model_pvalues, by.x=1, by.y=1)
# Models' dot colours
two_models_pvalues$colour <- "black"
two_models_pvalues$colour[two_models_pvalues$simple_pvalue < 0.05] <- "orange"
two_models_pvalues$colour[two_models_pvalues$patient_pvalue < 0.05] <- "blue"
two_models_pvalues$colour[two_models_pvalues$simple_pvalue < 0.05 &
two_models_pvalues$patient_pvalue<0.05] <- "red"
# Output comparison plot
plot(two_models_pvalues$simple_pvalue,
two_models_pvalues$patient_pvalue,
col = two_models_pvalues$colour,
xlab = "Simple model p-values",
ylab ="Patient model p-values",
main = "Figure 9. Simple vs Patient Limma")
# Plot legend
legend("topright",
legend=c("Simple", "Patient", "Both", "Not signif"),
fill = c("orange", "blue", "red", "black"))
# Set base colour
two_models_pvalues$colour <- "grey"
# Plot
plot(two_models_pvalues$simple_pvalue,
two_models_pvalues$patient_pvalue,
col = two_models_pvalues$colour,
xlab = "Simple model p-values",
ylab ="Patient model p-values",
main = "Figure 10. Simple vs Patient Limma")
# Highlight CCR genes
points(two_models_pvalues[which(grepl("CCR", two_models_pvalues$hgnc_symbol)), 2:3],
pch = 20, col = "red", cex = 1.5)
# Add Legend
legend("bottomright", legend=c("CCR", "rest"), fill = c("red", "grey"))
The comparison plot generated by both the complex and simple limma
models indicates a discrepancy between the two models, consistent with
the MDS plots presented earlier. Figure #. Simple vs Patient Limma
highlights the CCR genes referenced in the research, which are more
tightly clustered in the simple model compared to the complex model.
Therefore, I have opted to utilize the simple (tissue-only) model by
limma.
Package edgR provides quasi likelihood model for
analyzing the simple (tissue-only) scenario.
# Create design matrix
edgeR_model_design <- model.matrix(~ samples$tissue_type)
# Set edgeR object
edgeR_d <- edgeR::DGEList(counts = expressionMatrix,
group=samples$tissue_type)
# Estimate dispersion
edgeR_d <- edgeR::estimateDisp(edgeR_d, edgeR_model_design)
# Fit the Quasi liklihood model
edgeR_fit <- edgeR::glmQLFit(edgeR_d, edgeR_model_design)
edgeR_qlf <- edgeR::glmQLFTest(edgeR_fit, coef = 'samples$tissue_typetumor')
# Output fit table
edgeR::topTags(edgeR_qlf) %>%
kableExtra::kbl(caption = "Table 11. Data Fitted with Tissue Type by edgeR", row.names = FALSE) %>%
kableExtra::kable_classic(full_width = F, html_font = "Cambria")
|
|
|
|
# Sort by p-values
qlf_output_hits <- edgeR::topTags(edgeR_qlf,
sort.by = "PValue",
n = nrow(normalized_tcell_exp))
# How many gene pass the threshold p-value < 0.05?
length(which(qlf_output_hits$table$PValue < 0.05))
## [1] 2421
# How many genes pass correction?
length(which(qlf_output_hits$table$FDR < 0.05))
## [1] 300
Compare the results from the two different methods.
# Manipulate QLF set to contain HGNC symbols as well
qlf_df <- data.frame(ensembl_gene_id = rownames(qlf_output_hits$table),
qlf_pvalue = qlf_output_hits$table$PValue)
qlf_df <- merge(tcell_id_conversion, qlf_df, by = "ensembl_gene_id", all.x = TRUE)
# Get only HGNC symbol and p-values
qlf_model_pvalues <- qlf_df[, 2:3]
limma_model_pvalues <- data.frame(hgnc_symbol = output_hits$hgnc_symbol,
limma_pvalue = output_hits$P.Value)
# Merge two models' p-values
two_models_pvalues <- merge(qlf_model_pvalues, limma_model_pvalues, by.x=1, by.y=1)
# Decide the colours of dots
two_models_pvalues$colour <- "black"
two_models_pvalues$colour[two_models_pvalues$qlf_pvalue < 0.05] <- "orange"
two_models_pvalues$colour[two_models_pvalues$limma_pvalue < 0.05] <- "blue"
two_models_pvalues$colour[two_models_pvalues$qlf_pvalue < 0.05 &
two_models_pvalues$qlf_pvalue < 0.05] <- "red"
# Plot the comparison
plot(two_models_pvalues$qlf_pvalue,
two_models_pvalues$limma_pvalue,
col = two_models_pvalues$colour,
xlab = "QLF simple model p-values",
ylab ="Limma simple model p-values",
main="Figure 12. QLF vs Limma by Tissue Type")
legend("topleft", legend=c("QLF", "Limma", "Both", "Not signif"),
fill = c("orange", "blue", "red", "black"))
# Set base colour
two_models_pvalues$colour <- "grey"
# Plot
plot(two_models_pvalues$qlf_pvalue,
two_models_pvalues$limma_pvalue,
col = two_models_pvalues$colour,
xlab = "QLF model p-values",
ylab = "Limma model p-values",
main = "Figure 13. QLF vs Limma by Tissue Type (CCR)")
# Highlight CCR points
points(two_models_pvalues[which(grepl("CCR",two_models_pvalues$hgnc_symbol)), 2:3],
pch = 20, col = "red", cex = 1.5)
# Add legend
legend("bottomright", legend = c("CCR", "rest"), fill = c("red", "grey"))
The QLF model in edgeR package identifies a greater
number of CCR genes as significant compared to the limma
model. Therefore, I prefer to use the QLF model by
edgeR.
# Clustering samples by sample type
tumor <- normalized_tcell_exp[, grepl("tumor", colnames(normalized_tcell_exp))]
NBP <- normalized_tcell_exp[, grepl("NBP", colnames(normalized_tcell_exp))]
exp_avg <- data.frame(hgnc_symbol = gsub("-", ".", normalized_tcell_exp[, 2]),
Tumor = rowMeans(tumor),
NBP = rowMeans(NBP))
# Construct the edgeR result table
qlf_result <- cbind(qlf_output_hits$table, ensembl_gene_id = rownames(qlf_output_hits$table))
qlf_result <- merge(tcell_id_conversion, qlf_result, by = "ensembl_gene_id", all.x = TRUE)
# Merge it with the fit result
merged_result <- merge(exp_avg, qlf_result, by.x = "hgnc_symbol", by.y = "hgnc_symbol", all = TRUE)
# Prepare coloring on the plots
status <- rep(0, nrow(merged_result))
status[merged_result$logFC < 0 & merged_result$PValue < 0.05] <- -1
status[merged_result$logFC > 0 & merged_result$PValue < 0.05] <- 1
# MA plots (points labelled with edgeR)
limma::plotMD(log2(merged_result[, c(2,3)]),
status = status,
values = c(-1, 1),
hl.col=c("blue", "red"),
main = "Figure 14. MA Plot")
The amount of differentially expressed genes was shown by Figure 14. MA Plot. Up-regulated genes are highlighted in red, and down-regulated genes are highlighted in blue.
# Rename heatmap matrix's rownames to HGNC symbols
rownames(heatmap_matrix) <- normalized_tcell_exp$hgnc_symbol
# Subset the threshold gene
top_hits <- qlf_result$hgnc_symbol[qlf_result$PValue < 0.05]
# Create top hit version of heatmap matrix
heatmap_matrix_tophits <- heatmap_matrix[which(rownames(heatmap_matrix) %in% top_hits),]
heatmap_matrix_tophits <- heatmap_matrix_tophits[, c(grep("tumor", colnames(heatmap_matrix_tophits)),
grep("NBP", colnames(heatmap_matrix_tophits)))]
# Decide the colour by range
heatmap_col = circlize::colorRamp2(c(min(heatmap_matrix_tophits),
0,
max(heatmap_matrix_tophits)),
c("blue", "white", "red"))
# Set colours
ha_colours <- c("red", "darkgreen")
names(ha_colours) <- c("tumor", "NBP")
# Set annotations
ha <- ComplexHeatmap::HeatmapAnnotation(df=data.frame(
type = rep(c("tumor", "NBP"), c(22, 12))),
col = list(type = ha_colours))
# Plot annotated heatmap
annotated_heatmap <- ComplexHeatmap::Heatmap(as.matrix(heatmap_matrix_tophits),
cluster_rows = TRUE,
cluster_columns = FALSE,
show_row_dend = TRUE,
show_column_dend = FALSE,
col = heatmap_col,
show_column_names = TRUE,
show_row_names = FALSE,
show_heatmap_legend = TRUE,
top_annotation = ha,
column_title = "Figure 15. Annotated Heatmap")
# Output annotated heatmap
annotated_heatmap
The annotated heatmap provides better clustering compared to Figure 4. Initial Heatmap. Firstly, some red clusters were observed in the top left region of the data under tumor conditions. Secondly, some red clusters were observed in the bottom right region of the data under the NBP condition. These observations suggest that there is a clear distinction between the NBP and tumor conditions based on the presence and location of the red clusters.
# How many genes are up regulated?
length(which(qlf_result$PValue < 0.05 & qlf_result$logFC > 0))
## [1] 1070
# How many genes are down regulated?
length(which(qlf_result$PValue < 0.05 & qlf_result$logFC < 0))
## [1] 1350
# Add rank column and order
qlf_result[, "rank"] <- -log(qlf_result$PValue, base = 10) * sign(qlf_result$logFC)
qlf_result <- qlf_result[order(qlf_result$rank), ]
# Divide into 3 lists
whole_list <- qlf_result$hgnc_symbol[which(qlf_result$PValue < 0.05)]
upregulated_genes <- qlf_result$hgnc_symbol[which(qlf_result$PValue < 0.05 & qlf_result$logFC > 0)]
downregulated_genes <- qlf_result$hgnc_symbol[which(qlf_result$PValue < 0.05 & qlf_result$logFC < 0)]
# Obtain version information
version_info <- gprofiler2::get_version_info(organism = "hsapiens")
version_df <- as.data.frame(unlist(version_info$sources))
names(version_df)[1] <- "version"
version_df[, "annotation_data"] <- row.names(version_df)
# Select GO Biological Process, KEGG, Reactome, and Wikipathways
version_df <- version_df[version_df$annotation_data %in% c("GO:BP.version", "KEGG.version", "REAC.version", "WP.version"), ]
# Create version table
version_df <- version_df[, c(2, 1)]
version_df$annotation_data <- c("GO:BP", "KEGG", "REAC", "WP")
row.names(version_df) <- 1:4
# Output version table
version_df %>%
kableExtra::kbl(caption = "Table 16. Versions of Annotation Data", ) %>%
kableExtra::kable_classic(full_width = F, html_font = "Cambria")
| annotation_data | version |
|---|---|
| GO:BP | annotations: BioMart classes: releases/2022-12-04 |
| KEGG | KEGG FTP Release 2022-12-26 |
| REAC | annotations: BioMart classes: 2022-12-28 |
| WP | 20221210 |
# ORA (all genes)
gost_all <- gprofiler2::gost(query = whole_list,
organism = "hsapiens",
sources = c("GO:BP", "KEGG", "REAC", "WP"),
significant = FALSE,
user_threshold = 0.05,
correction_method = "fdr",
exclude_iea = TRUE)
# Number of genesets returned (all genes)
nrow(gost_all$result)
## [1] 11446
# Figure 17. Manhattan plot (All Genes)
gprofiler2::gostplot(gost_all)
# Significant results (all genes)
gost_all_result <- gost_all$result[order(gost_all$result$p_value, decreasing = FALSE),]
sig_all <- gost_all_result[which(gost_all_result$source == "GO:BP"), ][1, ]
sig_all <- rbind(sig_all, gost_all_result[which(gost_all_result$source == "KEGG"), ][1, ])
sig_all <- rbind(sig_all, gost_all_result[which(gost_all_result$source == "REAC"), ][1, ])
sig_all <- rbind(sig_all, gost_all_result[which(gost_all_result$source == "WP"), ][1, ])
row.names(sig_all) <- c(1:4)
sig_all <- subset(sig_all, select = c(9:11))
# Output significant results (all genes)
sig_all %>%
kableExtra::kbl(caption = "Table 18. Significant Result of All Genes", ) %>%
kableExtra::kable_classic(full_width = F, html_font = "Cambria")
| term_id | source | term_name |
|---|---|---|
| GO:0034097 | GO:BP | response to cytokine |
| KEGG:05321 | KEGG | Inflammatory bowel disease |
| REAC:R-HSA-1280215 | REAC | Cytokine Signaling in Immune system |
| WP:WP619 | WP | Type II interferon signaling |
# ORA (up-regulated genes)
gost_up <- gprofiler2::gost(query = upregulated_genes,
organism = "hsapiens",
sources = c("GO:BP", "KEGG", "REAC", "WP"),
significant = FALSE,
user_threshold = 0.05,
correction_method = "fdr",
exclude_iea = TRUE)
# Number of genesets returned (up-regulated genes)
nrow(gost_up$result)
## [1] 8777
# Figure 19. Manhattan plot (Up-regulated Genes)
gprofiler2::gostplot(gost_up)
# Significant results (up-regulated genes)
gost_up_result <- gost_up$result[order(gost_up$result$p_value, decreasing = FALSE),]
sig_up <- gost_up_result[which(gost_up_result$source == "GO:BP"), ][1, ]
sig_up <- rbind(sig_up, gost_up_result[which(gost_up_result$source == "KEGG"), ][1, ])
sig_up <- rbind(sig_up, gost_up_result[which(gost_up_result$source == "REAC"), ][1, ])
sig_up <- rbind(sig_up, gost_up_result[which(gost_up_result$source == "WP"), ][1, ])
row.names(sig_up) <- c(1:4)
sig_up <- subset(sig_up, select = c(9:11))
# Output significant results (up-regulated genes)
sig_up %>%
kableExtra::kbl(caption = "Table 20. Significant Result of Up-regulated Genes", ) %>%
kableExtra::kable_classic(full_width = F, html_font = "Cambria")
| term_id | source | term_name |
|---|---|---|
| GO:0006955 | GO:BP | immune response |
| KEGG:05169 | KEGG | Epstein-Barr virus infection |
| REAC:R-HSA-1280215 | REAC | Cytokine Signaling in Immune system |
| WP:WP619 | WP | Type II interferon signaling |
# ORA (down-regulated genes)
gost_down <- gprofiler2::gost(query = downregulated_genes,
organism = "hsapiens",
sources = c("GO:BP", "KEGG", "REAC", "WP"),
significant = FALSE,
user_threshold = 0.05,
correction_method = "fdr",
exclude_iea = TRUE)
# Number of genesets returned (down-regulated genes)
nrow(gost_down$result)
## [1] 9164
# Figure 21. Manhattan plot (Down-regulated Genes)
gprofiler2::gostplot(gost_down)
# Significant results (down-regulated genes)
gost_down_result <- gost_down$result[order(gost_down$result$p_value, decreasing = FALSE),]
sig_down <- gost_down_result[which(gost_down_result$source == "GO:BP"), ][1, ]
sig_down <- rbind(sig_down, gost_down_result[which(gost_down_result$source == "KEGG"), ][1, ])
sig_down <- rbind(sig_down, gost_down_result[which(gost_down_result$source == "REAC"), ][1, ])
sig_down <- rbind(sig_down, gost_down_result[which(gost_down_result$source == "WP"), ][1, ])
row.names(sig_down) <- c(1:4)
sig_down <- subset(sig_down, select = c(9:11))
# Output significant results (down-regulated genes)
sig_down %>%
kableExtra::kbl(caption = "Table 22. Significant Result of Down-regulated Genes", ) %>%
kableExtra::kable_classic(full_width = F, html_font = "Cambria")
| term_id | source | term_name |
|---|---|---|
| GO:0002181 | GO:BP | cytoplasmic translation |
| KEGG:03010 | KEGG | Ribosome |
| REAC:R-HSA-156902 | REAC | Peptide chain elongation |
| WP:WP477 | WP | Cytoplasmic ribosomal proteins |
P-values of genes in my expression set are calculated in 2.3 Limma and 2.4 edgeR (Quasi Liklihood Model) under different models.
limmalimmaedgeRP-value < 0.05 is used as a threshold for statistical significance in Fisher’s exact test. Using a p-value < 0.05 is a common practice in many fields for several decades, and it is a widely accepted convention nowadays. Also, setting a threshold of 0.05 implies that we are interested in detecting relatively large effects, rather than small or trivial effects.
I used the Empirical Bayes model (eBayes) in
limma and Quasi Likelihood (QLF) model
(glmQLFit) in edgeR for multiple hypothesis
testing and correction. I found the number of genes with false discovery
rate (FDR) < 0.05 in each model. FDR of genes in my expression set
are calculated in 2.3 Limma and 2.4 edgeR (Quasi Liklihood
Model) under different model.
limmalimmaedgeRThe amount of differentially expressed genes was shown by Figure 14. MA Plot in 2.6 MA Plot. Up-regulated genes are highlighted in red, and down-regulated genes are highlighted in blue.
By comparing the Figure 4. Initial Heatmap in 2.1 Heatmap and Figure 15. Annotated Heatmap in 2.7 Annotated Heatmap, the conditions cluster together better. There are two conditions: NBP and tumor in the data set. Some discoveries:
By conclusion, the annotated heatmap that contains top hits provides a better visualization compared to the initial heatmap.
I chose to use g:Profiler and its gprofiler2 R package
because it is a very easy and useful annotating tool for homo sapiens
genomes.
I used GO:BP, KEGG, REAC and WP. Because these annotation data are updated regularly and widely used in researches. Check Table 16. Versions of Annotation Data in 3.2 G:Profiler.
11446 genesets were returned with a threshold of 0.05 for the whole gene list. More details can be found in 3.3 All Genes.
The up-regulated genes are more significantly marked (-log10(p-adj) > 16) in Figure 19. Manhattan plot (Up-regulated Genes) in 3.4 Up-regulated Genes, and the down-regulated genes are not significantly marked in Figure 21. Manhattan plot (Down-regulated Genes) in 3.5 Down-regulated Genes. By comparing the results between all genes (Table 18. Significant Result of All Genes in 3.3 All Genes) and up-regulated genes (Table 20. Significant Result of Up-regulated Genes in 3.4 Up-regulated Genes), two sets share many common results, such as “Cytokine Signaling in Immune system” and “Type II interferon signaling”.
The original paper states that “a number of cytokine and chemokine receptor genes, most notably CCR8, were upregulated in tumor-resident Treg cells in comparison to normal tissue resident ones.” in the summary (Plitas et al. 2016). The over-representation results support this conclusion. The up-regulated genes are more significantly marked, and CCR8 is found in the up-regulated gene list as expected.
In paper, The Role of Cytokines in Breast Cancer Development and Progression, it states that “cytokines involved in angiogenesis. The inflammatory infiltrate that is usually found in breast tumors produce IL-6, IL-1α, and IL-1β, which upregulate COX-2, which, in turn, increases VEGF expression in tumor cells promoting angiogenesis.” IL-1α (IL1A) can be found under the up-regulated gene list, and the paper “suggests that cytokines play an important role in the regulation of both induction and protection in breast cancer (Esquivel-Velázquez et al. 2015).”